In this note, we generate the input files for GSEA to conduct gene set enrichment analysis of the identified genes differentially expressed between given groups of patients.
We generate input files for the differential expressed genes between nonacute stage I untreated vs all other groups based on Laura’s observations.
varnames<-c("nodule_lymph_pheno","steroid_atv1","dmard_atv1")
my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(nodule_lymph_pheno=c("lymph"),steroid_atv1=c(" 0"),dmard_atv1=c(" 0")),
group2.values = data.frame(nodule_lymph_pheno=c("micronodule","both"),steroid_atv1=c(" 0"," 0"),dmard_atv1=c(" 0"," 0")),
output.dir=output.folder,
suffix.name="analysis4"
)There are 43 and 75 samples for group1 and group2, respectively.
[1] 0
We first generate file containing both the clinical matrix and CT scan features so that both group 1 and group 2 in the GSEA analysis can be defined based on combinations of clinical triats and CT scan data.
# These codes only need to be run once
# load in the clinical matrix
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
# load in the CT scan data
ct.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Data/CT_data_20170712/Sarc_ct_reads_corrected.xls")
ct.matrix<-read.xls(ct.filepath,sheet=1)
rownames(ct.matrix)<-as.matrix(ct.matrix)[,"GRADSID"]
ct.matrix<-ct.matrix[,c("Med_Lymphadenopathy","Hilar_Lymphadenopathy","Micronodule","Bronchial_Wall_Thickening","Traction_Bronchiectasis","Bronchiectasis_severity","Ground_Glass","Honeycombing","Reticular_Abnormality","Pulmonary_Art","Tree_in_bud")]
# load in the fpkm matrix
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
fpkm.matrix<-read.table(fpkm.file,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# merge the two data
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
ct.matrix<-ct.matrix[colnames(fpkm.matrix),]
rownames(clinic.matrix)<-colnames(fpkm.matrix)
rownames(ct.matrix)<-colnames(fpkm.matrix)
cmd.out<-cbind(clinic.matrix,ct.matrix)
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.RDS")
saveRDS(cmd.out,file=output.filepath,refhook = NULL)
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.txt")
write.table(cmd.out,file=output.filepath,sep="\t",row.names=F,col.names=T,quote=F,append=F)
# change the file access right so that these two files won't be changed accidentally
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.RDS")
system(paste("chmod a-w ",output.filepath,sep=""))
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.txt")
system(paste("chmod a-w ",output.filepath,sep=""))Also generate the clinical matrix file.
# These codes only need to be run once.
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
output.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
write.table(clinic.matrix,file=output.filepath,sep="\t",row.names=F,col.names=T,quote=F,append=F)We generate input files for the differential expressed genes between nonacute stage I untreated vs all other groups based on Laura’s observations.
# output the merged clinical data matrix as a txt file
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
#output.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
#write.table(clinic.matrix,file=output.filepath,sep="\t",row.names=F,col.names=T,quote=F,append=F)
# note this function is slightly different for DESeq2 Norm data and the TPM matrix
my_gsea_filecreate_binary<-function(fpkm.filepath,clinic.filepath,var.names,group1.values,group2.values,group1.name="group1",group2.name="group2",output.dir,suffix.name){
#####################################################################################################################################################
# Arguments:
#
# gexp.filepath is the file path of the TPM matrix with first 5 columns being annotation for genes
# (TPM_baseline_276_clean_celldiffadjusted_withannot.txt).
#
# clinic.filpath is the file path of the clinic data (clinic_matrix_merged.txt). The first row needs to be the column names.
#
# var.names is a vector containing the names of columns in clinic.filepath that you want to generate the phenotype file for.
#
# group1.values is a data.frame of string describing values of var.names that are considered as gorup 1.
# For example, group1.values=data.frame(PHENGRP="nonacute Stage I untreated"). Note that the columns in group1.values need to match those in var.names if there are
# more than 1 variable to define the groups.
#
# group2.values has the same format as group1.values but is the settings for group2.
#
# group1.name is the name you want to call your group1 in the cls file.
#
# group2.name is the name you want to call your group2 in the cls file.
#
# output.dir is the folder name where you want to save the created files.
#
# suffix.name is the suffix name in the name of the output files. The gct file will be named as gct_suffix.name.gct and cls file will be named as
# cls_suffix.name.gct. For example, if suffix.name="test", the gct file will be named "gct_test.gct" and the cls file will be named as
# "cls_test.cls".
#
# Value:
#
# This function will return 0 if successfully ran. Otherwise, it will return 1. If unsuccessful, information regarding what went wrong will be spit
# out as warning messages.
#
# When ran successfully, this function will create two files under the specified output.dir named gct_var.name.gct and cls_var.name.cls, where
# var.name will be the speficied value
# in the argument. These two files can be directly loaded into GSEA together with another gene set file that needs to be generated outside of this
# function.
#####################################################################################################################################################
# generate the gene expression matrix input file for GSEA
#fpkm.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","TPM_baseline_276_clean_celldiffadjusted_withannot.txt")
#clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
#output.dir<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes"
# suffix.name<-"analysis1"
# load in the fpkm matrix
fpkm.matrix<-read.table(fpkm.filepath,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# load in the clinical data and subset the samples to those listed in the fpkm.matrix
clinic.matrix=read.table(clinic.filepath,sep="\t",header=T,check.names=F,stringsAsFactors=F)
rownames(clinic.matrix)<-as.matrix(clinic.matrix)[,1] # GRADS ID
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
# based on var.name, group1.values, and group2.values, identify the samples to include in the files.
if(file.exists(output.dir)==F){
dir.create(output.dir)
}
gct.filepath<-file.path(output.dir,paste("gexp_",suffix.name,".gct",sep=""))
cls.filepath<-file.path(output.dir,paste("cls_",suffix.name,".cls",sep=""))
# substract samples with var.name equal to the values in group1.values and group2. values
temp1<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group1.values,1,paste,collapse="_")]
#cat("\n")
#cat(apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_"))
#cat("\n")
temp2<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group2.values,1,paste,collapse="_")]
cat("There are ",ncol(temp1)," and ", ncol(temp2)," samples for group1 and group2, respectively.\n")
data.matrix<-cbind(temp1,temp2)
data.matrix<-cbind(fpkm.matrix.anno[,1],rep("na",nrow(data.matrix)),data.matrix)
colnames(data.matrix)[1:2]<-c("NAME","Description")
pheno.vect<-c(rep(0,ncol(temp1)),rep(1,ncol(temp2)))
# output the gene expression data
cmd.out<-"#1.2\n"
cmd.out<-paste(cmd.out,nrow(data.matrix),"\t",ncol(data.matrix)-2,"\n",sep="")
cat(cmd.out,file=gct.filepath,append=F)
write.table(data.matrix,sep="\t",file=gct.filepath,append=T,row.names=F,col.names=T,quote=F)
# output the phenotype file
cmd.out<-paste(ncol(data.matrix)-2," ",2," ",1,"\n",sep="")
cmd.out<-paste(cmd.out,"# group1 group2\n",sep="")
cat(cmd.out,file=cls.filepath,append=F,sep="")
cat(pheno.vect,file=cls.filepath,append=T,sep=" ")
results<-list(group1=temp1,group2=temp2)
return(results)
}
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
output.folder<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted"
# generate the files for analysis 1
varnames<-c("PHENGRP")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Non-acute, Stage I, untreated")),
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage IV, untreated")),
output.dir=output.folder,
suffix.name="analysis1"
)There are 29 and 68 samples for group1 and group2, respectively.
# generate the files for analysis 2
varnames<-c("PHENGRP")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Remitting, untreated")),
group2.values = data.frame(PHENGRP=c("Non-acute, Stage I, untreated","Stage II-III, untreated","Stage IV, untreated")),
output.dir=output.folder,
suffix.name="analysis2"
)There are 40 and 97 samples for group1 and group2, respectively.
# generate the files for analysis 3
varnames<-c("PHENGRP")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Stage IV, untreated")),
group2.values = data.frame(PHENGRP=c("Non-acute, Stage I, untreated","Stage II-III, untreated")),
output.dir=output.folder,
suffix.name="analysis3"
)There are 23 and 74 samples for group1 and group2, respectively.
# generate the files for analysis 4. If we directly specify group1.values using numbers, it won't work. I had to specify them to be a space+number as a character.
varnames<-c("nodule_lymph_pheno","steroid_atv1","dmard_atv1")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(nodule_lymph_pheno=c("lymph"),steroid_atv1=c(" 0"),dmard_atv1=c(" 0")),
group2.values = data.frame(nodule_lymph_pheno=c("micronodule","both"),steroid_atv1=c(" 0"," 0"),dmard_atv1=c(" 0"," 0")),
output.dir=output.folder,
suffix.name="analysis4"
)There are 43 and 75 samples for group1 and group2, respectively.
# generate the files for analysis 5.
varnames<-c("PHENGRP")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Remitting, untreated")),
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated")),
output.dir=output.folder,
suffix.name="analysis5"
)There are 40 and 45 samples for group1 and group2, respectively.
# generate the files for analysis 4a. If we directly specify group1.values using numbers, it won't work. I had to specify them to be a space+number as a character.
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.txt"
varnames<-c("nodule_lymph_pheno","steroid_atv1","dmard_atv1","Ground_Glass","Honeycombing","Reticular_Abnormality")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(nodule_lymph_pheno=c("lymph"),steroid_atv1=c(" 0"),dmard_atv1=c(" 0"),Ground_Glass=c(" 0"),Honeycombing=c(" 0"),Reticular_Abnormality=c(" 0")),
group2.values = data.frame(nodule_lymph_pheno=c("micronodule","both"),steroid_atv1=c(" 0"," 0"),dmard_atv1=c(" 0"," 0"),Ground_Glass=c(" 0"," 0"),Honeycombing=c(" 0"," 0"),Reticular_Abnormality=c(" 0"," 0")),
output.dir=output.folder,
suffix.name="analysis4a"
)There are 31 and 38 samples for group1 and group2, respectively.
# generate the files for analysis 4b. If we directly specify group1.values using numbers, it won't work. I had to specify them to be a space+number as a character.
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_ct_merge_matrix_withgex.txt"
varnames<-c("nodule_lymph_pheno","steroid_atv1","dmard_atv1","Ground_Glass","Honeycombing","Reticular_Abnormality")
temp<-my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(nodule_lymph_pheno=c("lymph"),steroid_atv1=c(" 0"),dmard_atv1=c(" 0"),Ground_Glass=c(" 0"),Honeycombing=c(" 0"),Reticular_Abnormality=c(" 0")),
group2.values = data.frame(nodule_lymph_pheno=c("micronodule"),steroid_atv1=c(" 0"),dmard_atv1=c(" 0"),Ground_Glass=c(" 0"),Honeycombing=c(" 0"),Reticular_Abnormality=c(" 0")),
output.dir=output.folder,
suffix.name="analysis4b"
)There are 31 and 12 samples for group1 and group2, respectively.
We also generate GSEA input files for patients from given lists including the following:
These three analysis is to see if the overlapped patients drove the significant results.
grads.id.list<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted/gradsid_overlap_nonoverlap.xlsx")
grads.id.matrix<-read.xls(grads.id.list,sheet=1)
overlap.id<-as.character(grads.id.matrix[,1])[as.character(grads.id.matrix[,1])!=""]
stage1only.id<-as.character(grads.id.matrix[,2])[as.character(grads.id.matrix[,2])!=""]
lymphonly.id<-as.character(grads.id.matrix[,3])[as.character(grads.id.matrix[,3])!=""]
#----------------------------------------------------------------------------------------------------------------
# Part 1: 12 stage I, untreated and lymph only patients
#----------------------------------------------------------------------------------------------------------------
# load in the data
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
output.folder<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted"
suffix.name<-"analysis6"
var.names=c("PHENGRP")
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage IV, untreated"))
# load in the fpkm matrix
fpkm.matrix<-read.table(fpkm.file,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# load in the clinical data and subset the samples to those listed in the fpkm.matrix
clinic.matrix=read.table(clinic.file,sep="\t",header=T,check.names=F,stringsAsFactors=F)
rownames(clinic.matrix)<-as.matrix(clinic.matrix)[,1] # GRADS ID
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
# based on var.name, group1.values, and group2.values, identify the samples to include in the files.
if(file.exists(output.folder)==F){
dir.create(output.folder)
}
gct.filepath<-file.path(output.folder,paste("gexp_",suffix.name,".gct",sep=""))
cls.filepath<-file.path(output.folder,paste("cls_",suffix.name,".cls",sep=""))
# substract samples with var.name equal to the values in group1.values and group2. values
#temp1<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group1.values,1,paste,collapse="_")]
temp1<-fpkm.matrix[,overlap.id]
temp2<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group2.values,1,paste,collapse="_")]
cat("There are ",ncol(temp1)," and ", ncol(temp2)," samples for group1 and group2, respectively.\n")There are 12 and 68 samples for group1 and group2, respectively.
data.matrix<-cbind(temp1,temp2)
data.matrix<-cbind(fpkm.matrix.anno[,1],rep("na",nrow(data.matrix)),data.matrix)
colnames(data.matrix)[1:2]<-c("NAME","Description")
pheno.vect<-c(rep(0,ncol(temp1)),rep(1,ncol(temp2)))
# output the gene expression data
cmd.out<-"#1.2\n"
cmd.out<-paste(cmd.out,nrow(data.matrix),"\t",ncol(data.matrix)-2,"\n",sep="")
cat(cmd.out,file=gct.filepath,append=F)
write.table(data.matrix,sep="\t",file=gct.filepath,append=T,row.names=F,col.names=T,quote=F)
# output the phenotype file
cmd.out<-paste(ncol(data.matrix)-2," ",2," ",1,"\n",sep="")
cmd.out<-paste(cmd.out,"# group1 group2\n",sep="")
cat(cmd.out,file=cls.filepath,append=F,sep="")
cat(pheno.vect,file=cls.filepath,append=T,sep=" ")
#----------------------------------------------------------------------------------------------------------------
# part 2: stage I only
#----------------------------------------------------------------------------------------------------------------
# load in the data
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
output.folder<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted"
suffix.name<-"analysis7"
var.names=c("PHENGRP")
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage IV, untreated"))
# load in the fpkm matrix
fpkm.matrix<-read.table(fpkm.file,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# load in the clinical data and subset the samples to those listed in the fpkm.matrix
clinic.matrix=read.table(clinic.file,sep="\t",header=T,check.names=F,stringsAsFactors=F)
rownames(clinic.matrix)<-as.matrix(clinic.matrix)[,1] # GRADS ID
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
# based on var.name, group1.values, and group2.values, identify the samples to include in the files.
if(file.exists(output.folder)==F){
dir.create(output.folder)
}
gct.filepath<-file.path(output.folder,paste("gexp_",suffix.name,".gct",sep=""))
cls.filepath<-file.path(output.folder,paste("cls_",suffix.name,".cls",sep=""))
# substract samples with var.name equal to the values in group1.values and group2. values
#temp1<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group1.values,1,paste,collapse="_")]
temp1<-fpkm.matrix[,stage1only.id]
temp2<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group2.values,1,paste,collapse="_")]
cat("There are ",ncol(temp1)," and ", ncol(temp2)," samples for group1 and group2, respectively.\n")There are 17 and 68 samples for group1 and group2, respectively.
data.matrix<-cbind(temp1,temp2)
data.matrix<-cbind(fpkm.matrix.anno[,1],rep("na",nrow(data.matrix)),data.matrix)
colnames(data.matrix)[1:2]<-c("NAME","Description")
pheno.vect<-c(rep(0,ncol(temp1)),rep(1,ncol(temp2)))
# output the gene expression data
cmd.out<-"#1.2\n"
cmd.out<-paste(cmd.out,nrow(data.matrix),"\t",ncol(data.matrix)-2,"\n",sep="")
cat(cmd.out,file=gct.filepath,append=F)
write.table(data.matrix,sep="\t",file=gct.filepath,append=T,row.names=F,col.names=T,quote=F)
# output the phenotype file
cmd.out<-paste(ncol(data.matrix)-2," ",2," ",1,"\n",sep="")
cmd.out<-paste(cmd.out,"# group1 group2\n",sep="")
cat(cmd.out,file=cls.filepath,append=F,sep="")
cat(pheno.vect,file=cls.filepath,append=T,sep=" ")
#----------------------------------------------------------------------------------------------------------------
# part 3: lymph only
#----------------------------------------------------------------------------------------------------------------
# load in the data
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
output.folder<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted"
suffix.name<-"analysis8"
var.names=c("PHENGRP")
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage IV, untreated"))
# load in the fpkm matrix
fpkm.matrix<-read.table(fpkm.file,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# load in the clinical data and subset the samples to those listed in the fpkm.matrix
clinic.matrix=read.table(clinic.file,sep="\t",header=T,check.names=F,stringsAsFactors=F)
rownames(clinic.matrix)<-as.matrix(clinic.matrix)[,1] # GRADS ID
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
# based on var.name, group1.values, and group2.values, identify the samples to include in the files.
if(file.exists(output.folder)==F){
dir.create(output.folder)
}
gct.filepath<-file.path(output.folder,paste("gexp_",suffix.name,".gct",sep=""))
cls.filepath<-file.path(output.folder,paste("cls_",suffix.name,".cls",sep=""))
# substract samples with var.name equal to the values in group1.values and group2. values
#temp1<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group1.values,1,paste,collapse="_")]
temp1<-fpkm.matrix[,lymphonly.id]
temp2<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group2.values,1,paste,collapse="_")]
cat("There are ",ncol(temp1)," and ", ncol(temp2)," samples for group1 and group2, respectively.\n")There are 33 and 68 samples for group1 and group2, respectively.
data.matrix<-cbind(temp1,temp2)
data.matrix<-cbind(fpkm.matrix.anno[,1],rep("na",nrow(data.matrix)),data.matrix)
colnames(data.matrix)[1:2]<-c("NAME","Description")
pheno.vect<-c(rep(0,ncol(temp1)),rep(1,ncol(temp2)))
# output the gene expression data
cmd.out<-"#1.2\n"
cmd.out<-paste(cmd.out,nrow(data.matrix),"\t",ncol(data.matrix)-2,"\n",sep="")
cat(cmd.out,file=gct.filepath,append=F)
write.table(data.matrix,sep="\t",file=gct.filepath,append=T,row.names=F,col.names=T,quote=F)
# output the phenotype file
cmd.out<-paste(ncol(data.matrix)-2," ",2," ",1,"\n",sep="")
cmd.out<-paste(cmd.out,"# group1 group2\n",sep="")
cat(cmd.out,file=cls.filepath,append=F,sep="")
cat(pheno.vect,file=cls.filepath,append=T,sep=" ")
#----------------------------------------------------------------------------------------------------------------We examine the CT features of these three groups of patients.
# load in the list of GRADS IDs in all three groups
grads.id.list<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted/gradsid_overlap_nonoverlap.xlsx")
grads.id.matrix<-read.xls(grads.id.list,sheet=1)
overlap.id<-as.character(grads.id.matrix[,1])[as.character(grads.id.matrix[,1])!=""]
stage1only.id<-as.character(grads.id.matrix[,2])[as.character(grads.id.matrix[,2])!=""]
lymphonly.id<-as.character(grads.id.matrix[,3])[as.character(grads.id.matrix[,3])!=""]
# load in the clinical data
clinic.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.RDS")
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
clinic.matrix<-clinic.matrix[,c("GENDER","RACE","ethn","AGE","ethor","wbc","cd4","cal","d25","d125","crp","p_lymph","p_mono","p_neut","p_eos","p_baso","FVCPRED","FEV1PRED","PREDDLCO","SCADDING","smoke","pk_yr","steroid_atv1","dmard_atv1","Micronodule","Med_Lymphadenopathy","Hilar_Lymphadenopathy","nodule_lymph_pheno")]
clinic.matrix<-clinic.matrix[c(overlap.id,stage1only.id,lymphonly.id),]
# generate the summary table
var.names<-c("GENDER","RACE","AGE","ethor","wbc","cd4","cal","d25","d125","crp","p_lymph","p_mono","p_neut","p_eos","p_baso","FVCPRED","FEV1PRED","PREDDLCO","SCADDING","smoke","pk_yr","steroid_atv1","dmard_atv1","Micronodule","Med_Lymphadenopathy","Hilar_Lymphadenopathy","nodule_lymph_pheno")
var.names.cat<-c("GENDER","RACE","ethor","smoke","steroid_atv1","dmard_atv1","SCADDING","Micronodule","Med_Lymphadenopathy","Hilar_Lymphadenopathy","nodule_lymph_pheno")
var.names.con<-c("AGE","wbc","cd4","cal","d25","d125","crp","p_lymph","p_mono","p_neut","p_eos","p_baso","FVCPRED","FEV1PRED","PREDDLCO","pk_yr")
var.type<-rep("con",length(var.names))
var.type[var.names%in%var.names.cat]<-"cat"
var.annot<-var.names
summary.table<-character()
for(i in 1:length(var.names)){
if(var.type[i]=="cat"){
temp.matrix<-as.matrix(table(clinic.matrix[overlap.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.annot[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}else{
temp.mean<-mean(clinic.matrix[overlap.id,var.names[i]],na.rm=T)
temp.sd<-sd(clinic.matrix[overlap.id,var.names[i]],na.rm=T)
temp.mean<-round(temp.mean,digits=2)
temp.sd<-round(temp.sd,digits=2)
summary.table<-rbind(summary.table,paste(temp.mean,"±",temp.sd,sep=""))
rownames(summary.table)[nrow(summary.table)]<-paste(var.annot[i],", mean±SD",sep="")
}
}
colnames(summary.table)[1]<-"Overlap"
summary.table.total<-summary.table
summary.table<-character()
for(i in 1:length(var.names)){
if(var.type[i]=="cat"){
temp.matrix<-as.matrix(table(clinic.matrix[stage1only.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.annot[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}else{
temp.mean<-mean(clinic.matrix[stage1only.id,var.names[i]],na.rm=T)
temp.sd<-sd(clinic.matrix[stage1only.id,var.names[i]],na.rm=T)
temp.mean<-round(temp.mean,digits=2)
temp.sd<-round(temp.sd,digits=2)
summary.table<-rbind(summary.table,paste(temp.mean,"±",temp.sd,sep=""))
rownames(summary.table)[nrow(summary.table)]<-paste(var.annot[i],", mean±SD",sep="")
}
}
colnames(summary.table)[1]<-"Stage I only"
temp.names<-union(rownames(summary.table.total),rownames(summary.table))
temp<-matrix("",nrow=length(temp.names),ncol=2)
rownames(temp)<-temp.names
temp[rownames(summary.table.total),1]<-summary.table.total[,1]
temp[rownames(summary.table),2]<-summary.table[,1]
summary.table.total<-temp
summary.table<-character()
for(i in 1:length(var.names)){
if(var.type[i]=="cat"){
temp.matrix<-as.matrix(table(clinic.matrix[lymphonly.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.annot[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}else{
temp.mean<-mean(clinic.matrix[lymphonly.id,var.names[i]],na.rm=T)
temp.sd<-sd(clinic.matrix[lymphonly.id,var.names[i]],na.rm=T)
temp.mean<-round(temp.mean,digits=2)
temp.sd<-round(temp.sd,digits=2)
summary.table<-rbind(summary.table,paste(temp.mean,"±",temp.sd,sep=""))
rownames(summary.table)[nrow(summary.table)]<-paste(var.annot[i],", mean±SD",sep="")
}
}
colnames(summary.table)[1]<-"Lymph only"
temp.names<-union(rownames(summary.table.total),rownames(summary.table))
temp<-matrix("",nrow=length(temp.names),ncol=3)
rownames(temp)<-temp.names
temp[rownames(summary.table.total),1]<-summary.table.total[,1]
temp[rownames(summary.table.total),2]<-summary.table.total[,2]
temp[rownames(summary.table),3]<-summary.table[,1]
summary.table.total<-temp
colnames(summary.table.total)<-c("overlap","Stage 1 only","Lymph only")
# calculate the p values to assess the differences between the three groups of patients for each trait
pvalue.vect<-numeric()
for(i in 1:length(var.names)){
if(var.type[i]=="cat"){
temp.matrix<-as.matrix(table(clinic.matrix[c(overlap.id,stage1only.id,lymphonly.id),var.names[i]]))
rownames(temp.matrix)<-paste(var.annot[i]," - ",rownames(temp.matrix))
temp1<-clinic.matrix[c(overlap.id,stage1only.id,lymphonly.id),var.names[i]]
temp2<-c(rep("overlap",length(overlap.id)),rep("stage1",length(stage1only.id)),rep("lymph",length(lymphonly.id)))
temp.index<-1:length(temp1)
temp.index<-temp.index[!(is.na(temp1) | is.na(temp2))]
temp1<-temp1[temp.index]
temp2<-temp2[temp.index]
if(length(unique(temp1))==1){
pvalue.vect<-c(pvalue.vect, rep(1,length(unique(temp1))))
names(pvalue.vect)[length(pvalue.vect)]<-rownames(temp.matrix)
}else{
pvalue.vect<-c(pvalue.vect, rep(chisq.test(temp1,as.factor(temp2))$p.value,length(unique(temp1))))
names(pvalue.vect)[(length(pvalue.vect)-length(unique(temp1))+1):length(pvalue.vect)]<-rownames(temp.matrix)
}
}else{
temp1<-clinic.matrix[c(overlap.id,stage1only.id,lymphonly.id),var.names[i]]
temp2<-c(rep("overlap",length(overlap.id)),rep("stage1",length(stage1only.id)),rep("lymph",length(lymphonly.id)))
pvalue.vect<-c(pvalue.vect, kruskal.test(temp1~as.factor(temp2))$p.value)
names(pvalue.vect)[length(pvalue.vect)]<-paste(var.annot[i],", mean±SD",sep="")
}
}
summary.table.total<-cbind(summary.table.total,pvalue.vect[rownames(summary.table.total)])
colnames(summary.table.total)[4]<-"p value"
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted/gradsid_overlap_nonoverlap_clinicfeatures.xlsx")
write.xlsx(summary.table.total,file=output.filepath,row.names=T,col.names=T,append=F)We also generate the feature table for the CT scan variables.
# load in the list of GRADS IDs in all three groups
grads.id.list<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted/gradsid_overlap_nonoverlap.xlsx")
grads.id.matrix<-read.xls(grads.id.list,sheet=1)
overlap.id<-as.character(grads.id.matrix[,1])[as.character(grads.id.matrix[,1])!=""]
stage1only.id<-as.character(grads.id.matrix[,2])[as.character(grads.id.matrix[,2])!=""]
lymphonly.id<-as.character(grads.id.matrix[,3])[as.character(grads.id.matrix[,3])!=""]
# load in the clinical data
ct.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Data/CT_data_20170712/Sarc_ct_reads_corrected.xls")
ct.matrix<-read.xls(ct.filepath,sheet=1)
rownames(ct.matrix)<-as.matrix(ct.matrix)[,"GRADSID"]
ct.matrix<-ct.matrix[,c("Med_Lymphadenopathy","Hilar_Lymphadenopathy","Micronodule","Bronchial_Wall_Thickening","Traction_Bronchiectasis","Bronchiectasis_severity","Ground_Glass","Honeycombing","Reticular_Abnormality","Pulmonary_Art","Tree_in_bud")]
ct.matrix<-ct.matrix[c(overlap.id,stage1only.id,lymphonly.id),]
group.vect<-c(rep("overlap",length(overlap.id)),rep("Stage I",length(stage1only.id)),rep("Lymph",length(lymphonly.id)))
# recode Med_Lymphadenopathy and Hilar_Lymphadenopathy
temp<-ct.matrix[,"Med_Lymphadenopathy"]
temp1<-rep(0,length(temp))
temp1[temp%in%c(1,2)]<-1
temp1[temp==3]<-2 # 0=no, 1=one side, 2=billateral
ct.matrix[,"Med_Lymphadenopathy"]<-temp1
temp<-ct.matrix[,"Hilar_Lymphadenopathy"]
temp1<-rep(0,length(temp))
temp1[temp%in%c(1,2)]<-1
temp1[temp==3]<-2 # 0=no, 1=one side, 2=billateral
ct.matrix[,"Hilar_Lymphadenopathy"]<-temp1
# generate the summary table
var.names<-c("Med_Lymphadenopathy","Hilar_Lymphadenopathy","Micronodule","Bronchial_Wall_Thickening","Traction_Bronchiectasis","Bronchiectasis_severity","Ground_Glass","Honeycombing","Reticular_Abnormality","Pulmonary_Art","Tree_in_bud")
summary.table<-character()
for(i in 1:length(var.names)){
temp.matrix<-as.matrix(table(ct.matrix[overlap.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.names[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}
colnames(summary.table)[1]<-"Overlap"
summary.table.total<-summary.table
summary.table<-character()
for(i in 1:length(var.names)){
temp.matrix<-as.matrix(table(ct.matrix[stage1only.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.names[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}
colnames(summary.table)[1]<-"Stage I"
temp.names<-union(rownames(summary.table.total),rownames(summary.table))
temp<-matrix("",nrow=length(temp.names),ncol=2)
rownames(temp)<-temp.names
temp[rownames(summary.table.total),1]<-summary.table.total[,1]
temp[rownames(summary.table),2]<-summary.table[,1]
summary.table.total<-temp
summary.table<-character()
for(i in 1:length(var.names)){
temp.matrix<-as.matrix(table(ct.matrix[lymphonly.id,var.names[i]]))
rownames(temp.matrix)<-paste(var.names[i]," - ",rownames(temp.matrix))
temp.sum<-sum(temp.matrix)
temp.matrix2<-temp.matrix/temp.sum
summary.table<-rbind(summary.table,as.matrix(paste(temp.matrix,"(",round(temp.matrix2,digits=2),")",sep="")))
rownames(summary.table)[(nrow(summary.table)-nrow(temp.matrix)+1):nrow(summary.table)]<-rownames(temp.matrix)
}
colnames(summary.table)[1]<-"Lymph only"
temp.names<-union(rownames(summary.table.total),rownames(summary.table))
temp<-matrix("",nrow=length(temp.names),ncol=3)
rownames(temp)<-temp.names
temp[rownames(summary.table.total),1]<-summary.table.total[,1]
temp[rownames(summary.table.total),2]<-summary.table.total[,2]
temp[rownames(summary.table),3]<-summary.table[,1]
summary.table.total<-temp
colnames(summary.table.total)<-c("overlap","Stage 1 only","Lymph only")
# calculate the p values to assess the differences between the three groups of patients for each trait
pvalue.vect<-numeric()
for(i in 1:length(var.names)){
temp.matrix<-as.matrix(table(ct.matrix[c(overlap.id,stage1only.id,lymphonly.id),var.names[i]]))
rownames(temp.matrix)<-paste(var.names[i]," - ",rownames(temp.matrix))
temp1<-ct.matrix[c(overlap.id,stage1only.id,lymphonly.id),var.names[i]]
temp2<-c(rep("overlap",length(overlap.id)),rep("stage1",length(stage1only.id)),rep("lymph",length(lymphonly.id)))
temp.index<-1:length(temp1)
temp.index<-temp.index[!(is.na(temp1) | is.na(temp2))]
temp1<-temp1[temp.index]
temp2<-temp2[temp.index]
if(length(unique(temp1))==1){
pvalue.vect<-c(pvalue.vect, rep(1,length(unique(temp1))))
names(pvalue.vect)[length(pvalue.vect)]<-rownames(temp.matrix)
}else{
pvalue.vect<-c(pvalue.vect, rep(chisq.test(temp1,as.factor(temp2))$p.value,length(unique(temp1))))
names(pvalue.vect)[(length(pvalue.vect)-length(unique(temp1))+1):length(pvalue.vect)]<-rownames(temp.matrix)
}
}
summary.table.total<-cbind(summary.table.total,pvalue.vect[rownames(summary.table.total)])
colnames(summary.table.total)[4]<-"p value"
output.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted/gradsid_overlap_nonoverlap_ctfeatures.xlsx")
write.xlsx(summary.table.total,file=output.filepath,row.names=T,col.names=T,append=F)We also generated the GSEA input files for three three groups of patients using the function we wrote.
# output the merged clinical data matrix as a txt file
clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.RDS"
clinic.matrix<-readRDS(clinic.filepath,refhook = NULL)
output.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
write.table(clinic.matrix,file=output.filepath,sep="\t",row.names=F,col.names=T,quote=F,append=F)
# note this function is slightly different for DESeq2 Norm data and the TPM matrix
my_gsea_filecreate_binary<-function(fpkm.filepath,clinic.filepath,var.names,group1.values,group2.values,group1.name="group1",group2.name="group2",output.dir,suffix.name){
#####################################################################################################################################################
# Arguments:
#
# gexp.filepath is the file path of the TPM matrix with first 5 columns being annotation for genes
# (TPM_baseline_276_clean_celldiffadjusted_withannot.txt).
#
# clinic.filpath is the file path of the clinic data (clinic_matrix_merged.txt). The first row needs to be the column names.
#
# var.names is a vector containing the names of columns in clinic.filepath that you want to generate the phenotype file for.
#
# group1.values is a data.frame of string describing values of var.names that are considered as gorup 1.
# For example, group1.values=data.frame(PHENGRP="nonacute Stage I untreated"). Note that the columns in group1.values need to match those in var.names if there are
# more than 1 variable to define the groups.
#
# group2.values has the same format as group1.values but is the settings for group2.
#
# group1.name is the name you want to call your group1 in the cls file.
#
# group2.name is the name you want to call your group2 in the cls file.
#
# output.dir is the folder name where you want to save the created files.
#
# suffix.name is the suffix name in the name of the output files. The gct file will be named as gct_suffix.name.gct and cls file will be named as
# cls_suffix.name.gct. For example, if suffix.name="test", the gct file will be named "gct_test.gct" and the cls file will be named as
# "cls_test.cls".
#
# Value:
#
# This function will return 0 if successfully ran. Otherwise, it will return 1. If unsuccessful, information regarding what went wrong will be spit
# out as warning messages.
#
# When ran successfully, this function will create two files under the specified output.dir named gct_var.name.gct and cls_var.name.cls, where
# var.name will be the speficied value
# in the argument. These two files can be directly loaded into GSEA together with another gene set file that needs to be generated outside of this
# function.
#####################################################################################################################################################
# generate the gene expression matrix input file for GSEA
#fpkm.filepath<-file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","TPM_baseline_276_clean_celldiffadjusted_withannot.txt")
#clinic.filepath<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
#output.dir<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes"
# suffix.name<-"analysis1"
# load in the fpkm matrix
fpkm.matrix<-read.table(fpkm.filepath,sep="\t",header=T,check.names=F,as.is=TRUE,comment.char = "")
fpkm.matrix.anno<-fpkm.matrix[,1:6]
fpkm.matrix<-fpkm.matrix[,7:ncol(fpkm.matrix)]
# load in the clinical data and subset the samples to those listed in the fpkm.matrix
clinic.matrix=read.table(clinic.filepath,sep="\t",header=T,check.names=F,stringsAsFactors=F)
rownames(clinic.matrix)<-as.matrix(clinic.matrix)[,1] # GRADS ID
clinic.matrix<-clinic.matrix[colnames(fpkm.matrix),]
# based on var.name, group1.values, and group2.values, identify the samples to include in the files.
if(file.exists(output.dir)==F){
dir.create(output.dir)
}
gct.filepath<-file.path(output.dir,paste("gexp_",suffix.name,".gct",sep=""))
cls.filepath<-file.path(output.dir,paste("cls_",suffix.name,".cls",sep=""))
# substract samples with var.name equal to the values in group1.values and group2. values
temp1<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group1.values,1,paste,collapse="_")]
temp2<-fpkm.matrix[,apply(as.data.frame(clinic.matrix[,var.names]),1,paste,collapse="_")%in%apply(group2.values,1,paste,collapse="_")]
cat("There are ",ncol(temp1)," and ", ncol(temp2)," samples for group1 and group2, respectively.\n")
data.matrix<-cbind(temp1,temp2)
data.matrix<-cbind(fpkm.matrix.anno[,1],rep("na",nrow(data.matrix)),data.matrix)
colnames(data.matrix)[1:2]<-c("NAME","Description")
pheno.vect<-c(rep(0,ncol(temp1)),rep(1,ncol(temp2)))
# output the gene expression data
cmd.out<-"#1.2\n"
cmd.out<-paste(cmd.out,nrow(data.matrix),"\t",ncol(data.matrix)-2,"\n",sep="")
cat(cmd.out,file=gct.filepath,append=F)
write.table(data.matrix,file=gct.filepath,append=T,row.names=F,col.names=T,quote=F)
# output the phenotype file
cmd.out<-paste(ncol(data.matrix)-2,"\t",2,"\t",1,"\n",sep="")
cmd.out<-paste(cmd.out,"# group1 group2\n",sep="")
cat(cmd.out,file=cls.filepath,append=F,sep="")
cat(pheno.vect,file=cls.filepath,append=T,sep=" ")
return(0)
}
fpkm.file=file.path(home.dir,"scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data_adjusted2","DESeq2_normalized_276_clean_log2_celldiffadjusted_withannot.txt")
clinic.file<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/clinic_matrix_merged.txt"
output.folder<-"/home/yanxiting/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/GSEA_knowngenes_DESeq2_log2adjusted"
# generate the files for analysis 9: overlapped patients
varnames<-c("PHENGRP","nodule_lymph_pheno")
my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Non-acute, Stage I, untreated"),nodule_lymph_pheno=c("lymph")),
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage II-III, untreated","Stage II-III, untreated","Stage IV, untreated","Stage IV, untreated","Stage IV, untreated"),nodule_lymph_pheno=c("lymph","micronodule","both","lymph","micronodule","both")),
output.dir=output.folder,
suffix.name="analysis9"
)
# generate the files for analysis 10: Stage I only
varnames<-c("PHENGRP","nodule_lymph_pheno")
my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=c("Non-acute, Stage I, untreated","Non-acute, Stage I, untreated"),nodule_lymph_pheno=c("micronodule","both")),
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage II-III, untreated","Stage II-III, untreated","Stage IV, untreated","Stage IV, untreated","Stage IV, untreated"),nodule_lymph_pheno=c("lymph","micronodule","both","lymph","micronodule","both")),
output.dir=output.folder,
suffix.name="analysis10"
)
# generate the files for analysis 11: lymph only (no stage I)
varnames<-c("PHENGRP","nodule_lymph_pheno")
phen.value<-c("Non-acute, Stage I, untreated","Cardiac defining therapy","Stage IV, treated","Stage IV, untreated","Stage II-III, untreated","Remitting, untreated","Multi-organ", "Stage II-III, treated","Acute Sarcoidosis, untreated")
phen.value<-setdiff(phen.value,"Non-acute, Stage I, untreated")
my_gsea_filecreate_binary(fpkm.filepath=fpkm.file,
clinic.filepath = clinic.file,
var.names = varnames,
group1.values = data.frame(PHENGRP=phen.value,nodule_lymph_pheno=rep("lymph",length(phen.value))),
group2.values = data.frame(PHENGRP=c("Stage II-III, untreated","Stage II-III, untreated","Stage II-III, untreated","Stage IV, untreated","Stage IV, untreated","Stage IV, untreated"),nodule_lymph_pheno=c("lymph","micronodule","both","lymph","micronodule","both")),
output.dir=output.folder,
suffix.name="analysis11"
)We downloaded a previously published 10X Genomicx data and compared the genes in Bloom data to see if they are enriched in cell type markers or which cell types are expressing them.
First, we load in the raw count data and redo the preprocessing using standard Seurat pipeline.
library(Matrix)
count.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/GSE132338_matrix_counts.mtx"
coldata.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/GSE132338_matrix_colData.txt"
rowdata.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/GSE132338_matrix_rowData.mtx"
data.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/GSE132338_SingleCellExperiment.RDS"
sc.data<-readRDS(data.filepath,refhook=NULL)
count.data<-readMM(count.filepath)
mycol.data<-read.table(coldata.filepath,sep="\t",row.names=1,header=T)
myrow.data<-readLines(rowdata.filepath)
# create a seurat object with the count data and put the integrated data in
rownames(count.data)<-myrow.data[2:length(myrow.data)]
colnames(count.data)<-rownames(mycol.data)
# create a Seurat object using the count matrix
my.data<-CreateSeuratObject(counts=count.data,project="PBMC",min.cells=0,min.features = 0)
my.data[["percent.mt"]] <- PercentageFeatureSet(my.data, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(my.data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(my.data, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(my.data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
grid.arrange(plot1 , plot2,ncol=2)
my.data <- NormalizeData(my.data, normalization.method = "LogNormalize", scale.factor = 10000)Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
my.data <- FindVariableFeatures(my.data, selection.method = "vst", nfeatures = 2000)Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(my.data), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(my.data)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)When using repel, set xnudge and ynudge to 0 for optimal results
grid.arrange(plot1,plot2,ncol=2)# decide the number of PC to use
my.data <- JackStraw(my.data, dims = 40,num.replicate = 100)
| | 0 % ~calculating
|+ | 1 % ~01h 12m 28s
|+ | 2 % ~01h 12m 49s
|++ | 3 % ~01h 13m 21s
|++ | 4 % ~01h 12m 38s
|+++ | 5 % ~01h 11m 49s
|+++ | 6 % ~01h 12m 03s
|++++ | 7 % ~01h 11m 09s
|++++ | 8 % ~01h 10m 10s
|+++++ | 9 % ~01h 09m 29s
|+++++ | 10% ~01h 08m 22s
|++++++ | 11% ~01h 07m 50s
|++++++ | 12% ~01h 06m 43s
|+++++++ | 13% ~01h 05m 50s
|+++++++ | 14% ~01h 04m 54s
|++++++++ | 15% ~01h 04m 20s
|++++++++ | 16% ~01h 03m 37s
|+++++++++ | 17% ~01h 02m 43s
|+++++++++ | 18% ~01h 01m 58s
|++++++++++ | 19% ~01h 01m 22s
|++++++++++ | 20% ~01h 00m 22s
|+++++++++++ | 21% ~59m 33s
|+++++++++++ | 22% ~58m 50s
|++++++++++++ | 23% ~58m 02s
|++++++++++++ | 24% ~57m 13s
|+++++++++++++ | 25% ~56m 28s
|+++++++++++++ | 26% ~56m 12s
|++++++++++++++ | 27% ~55m 24s
|++++++++++++++ | 28% ~54m 29s
|+++++++++++++++ | 29% ~53m 52s
|+++++++++++++++ | 30% ~53m 08s
|++++++++++++++++ | 31% ~52m 22s
|++++++++++++++++ | 32% ~51m 32s
|+++++++++++++++++ | 33% ~50m 43s
|+++++++++++++++++ | 34% ~50m 01s
|++++++++++++++++++ | 35% ~49m 17s
|++++++++++++++++++ | 36% ~48m 31s
|+++++++++++++++++++ | 37% ~47m 48s
|+++++++++++++++++++ | 38% ~46m 59s
|++++++++++++++++++++ | 39% ~46m 31s
|++++++++++++++++++++ | 40% ~46m 03s
|+++++++++++++++++++++ | 41% ~45m 25s
|+++++++++++++++++++++ | 42% ~44m 38s
|++++++++++++++++++++++ | 43% ~43m 49s
|++++++++++++++++++++++ | 44% ~43m 04s
|+++++++++++++++++++++++ | 45% ~42m 18s
|+++++++++++++++++++++++ | 46% ~41m 28s
|++++++++++++++++++++++++ | 47% ~40m 48s
|++++++++++++++++++++++++ | 48% ~39m 59s
|+++++++++++++++++++++++++ | 49% ~39m 11s
|+++++++++++++++++++++++++ | 50% ~38m 26s
|++++++++++++++++++++++++++ | 51% ~37m 44s
|++++++++++++++++++++++++++ | 52% ~36m 56s
|+++++++++++++++++++++++++++ | 53% ~36m 07s
|+++++++++++++++++++++++++++ | 54% ~35m 24s
|++++++++++++++++++++++++++++ | 55% ~34m 36s
|++++++++++++++++++++++++++++ | 56% ~33m 49s
|+++++++++++++++++++++++++++++ | 57% ~33m 01s
|+++++++++++++++++++++++++++++ | 58% ~32m 12s
|++++++++++++++++++++++++++++++ | 59% ~31m 26s
|++++++++++++++++++++++++++++++ | 60% ~30m 39s
|+++++++++++++++++++++++++++++++ | 61% ~29m 52s
|+++++++++++++++++++++++++++++++ | 62% ~29m 04s
|++++++++++++++++++++++++++++++++ | 63% ~28m 15s
|++++++++++++++++++++++++++++++++ | 64% ~27m 28s
|+++++++++++++++++++++++++++++++++ | 65% ~26m 42s
|+++++++++++++++++++++++++++++++++ | 66% ~25m 55s
|++++++++++++++++++++++++++++++++++ | 67% ~25m 09s
|++++++++++++++++++++++++++++++++++ | 68% ~24m 24s
|+++++++++++++++++++++++++++++++++++ | 69% ~23m 38s
|+++++++++++++++++++++++++++++++++++ | 70% ~22m 51s
|++++++++++++++++++++++++++++++++++++ | 71% ~22m 07s
|++++++++++++++++++++++++++++++++++++ | 72% ~21m 21s
|+++++++++++++++++++++++++++++++++++++ | 73% ~20m 34s
|+++++++++++++++++++++++++++++++++++++ | 74% ~19m 47s
|++++++++++++++++++++++++++++++++++++++ | 75% ~19m 01s
|++++++++++++++++++++++++++++++++++++++ | 76% ~18m 14s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~17m 28s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~16m 42s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~15m 56s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~15m 11s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~14m 25s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~13m 40s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~12m 54s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~12m 09s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~11m 23s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~10m 37s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~09m 51s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~09m 05s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~08m 19s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~07m 34s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~06m 49s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~06m 03s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~05m 18s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~04m 32s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~03m 47s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~03m 02s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~02m 16s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~01m 31s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~45s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01h 15m 37s
| | 0 % ~calculating
|++ | 2 % ~02s
|+++ | 5 % ~02s
|++++ | 8 % ~02s
|+++++ | 10% ~02s
|+++++++ | 12% ~02s
|++++++++ | 15% ~01s
|+++++++++ | 18% ~01s
|++++++++++ | 20% ~01s
|++++++++++++ | 22% ~01s
|+++++++++++++ | 25% ~01s
|++++++++++++++ | 28% ~01s
|+++++++++++++++ | 30% ~01s
|+++++++++++++++++ | 32% ~01s
|++++++++++++++++++ | 35% ~01s
|+++++++++++++++++++ | 38% ~01s
|++++++++++++++++++++ | 40% ~01s
|++++++++++++++++++++++ | 42% ~01s
|+++++++++++++++++++++++ | 45% ~01s
|++++++++++++++++++++++++ | 48% ~01s
|+++++++++++++++++++++++++ | 50% ~01s
|+++++++++++++++++++++++++++ | 52% ~01s
|++++++++++++++++++++++++++++ | 55% ~01s
|+++++++++++++++++++++++++++++ | 58% ~01s
|++++++++++++++++++++++++++++++ | 60% ~01s
|++++++++++++++++++++++++++++++++ | 62% ~01s
|+++++++++++++++++++++++++++++++++ | 65% ~01s
|++++++++++++++++++++++++++++++++++ | 68% ~01s
|+++++++++++++++++++++++++++++++++++ | 70% ~01s
|+++++++++++++++++++++++++++++++++++++ | 72% ~00s
|++++++++++++++++++++++++++++++++++++++ | 75% ~00s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~00s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~00s
|++++++++++++++++++++++++++++++++++++++++++ | 82% ~00s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~00s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=02s
my.data <- ScoreJackStraw(my.data, dims = 1:40)
JackStrawPlot(my.data, dims = 1:40)We decides to use the top 20 PCs.
Draw the UMAP with annotation from GEO.
DimPlot(my.data, reduction = "umap",shuffle=T)The following functions and any applicable methods accept the dots: CombinePlots
Save the Seurat object.
saveRDS(my.data, file ="/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare/GSE132338_seurat_nointegration.rds")We check the expression of the analysis4b genes in all cell types.
my.distinct.colors<-c('#e6194b' , '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#808080', '#800000', '#808000', '#ffd8b1', '#000075', '#ffffff', '#000000','#fffac8','#aaffc3')
celltype.colors<-my.distinct.colors[1:length(unique(my.data@meta.data$celltype))]
bloom.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/analysis4b_BLOOM.tsv"
bloom.data<-as.data.frame(read_tsv(bloom.filepath))Parsed with column specification:
cols(
NAME = col_character(),
SYMBOL = col_character(),
TITLE = col_character(),
`RANK IN GENE LIST` = col_double(),
`RANK METRIC SCORE` = col_double(),
`RUNNING ES` = col_double(),
`CORE ENRICHMENT` = col_character(),
X8 = col_logical()
)
bloom.genes<-bloom.data[bloom.data$`CORE ENRICHMENT`=="Yes","SYMBOL"]
temp.names<-intersect(bloom.genes,rownames(sc.data@assays$integrated@counts))
#output.filepath<-"~/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/scRNA_compare/featureplot_bloom_4b.pdf"
#output.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare/featureplot_bloom_4b.pdf"
#pdf(output.filepath,width = 8,height=8,onefile = T)
#g.list<-list()
for(i in 1:length(temp.names)){
#temp.data<-my.data@meta.data
#g.1<-ggplot(temp.data,aes(x=UMAP_1,y=UMAP_2,colour=celltype))+geom_point(size=0.1,shape=19)+scale_color_manual(values=celltype.colors)+guides(color = guide_legend(override.aes = list(size = 8)))
Idents(my.data)<-my.data@meta.data$celltype
g.1<-DimPlot(my.data, reduction = "umap",cols=celltype.colors,shuffle=T)
g.2<-FeaturePlot(my.data,features=temp.names[i],pt.size = 0.1,order=T)
g<-grid.arrange(g.1,g.2,ncol=2)
output.filepath<-file.path("/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare",paste("featureplot_bloom_4b_",temp.names[i],".pdf"))
ggsave(output.filepath,g,device="pdf",width=10,height=5)
}The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
The following functions and any applicable methods accept the dots: CombinePlots
Conduct integrated analysis to remove donor effect. We have not succeeded in the integration analysis due to the very small number of cells per subject.
# split the dataset into a list of two seurat objects (stim and CTRL)
ifnb.list <- SplitObject(my.data, split.by = "donor")
temp.size<-matrix(unlist(lapply(ifnb.list,dim)),ncol=2,byrow=T)[,2]
#ifnb.list<-ifnb.list[temp.size>=30]
# normalize and identify variable features for each dataset independently
ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = ifnb.list)
ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
x <- ScaleData(x, features = features, verbose = FALSE)
x <- RunPCA(x, features = features, verbose = FALSE, approx=FALSE)
})Error in pca.results$x %*% diag(pca.results$sdev[1:npcs]^2) :
non-conformable arguments
# left over
my.data@assays$integrated<-sc.data@assays$integrated
my.data@meta.data<-cbind(my.data@meta.data,mycol.data)
Idents(my.data)<-my.data@meta.data$celltype
# identify the cell type marker genes
#Idents(sc.data)<-sc.data@meta.data$celltype
celltype.names<-unique(as.character(my.data@meta.data$celltype))
sc.markers.result<-list()
for(i in 1:length(celltype.names)){
sc.markers.result[[i]]<-FindMarkers(my.data,ident.1=celltype.names[i],min.pct=0.05)
}
# save the marker lists
#output.filepath<-file.path("~/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/data/GSE132338_SingleCellExperiment_markerlist.RDS")
output.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare/GSE132338_SingleCellExperiment_markerlist.RDS"
saveRDS(sc.markers.result,file=output.filepath,refhook = NULL)Draw the UMAPs to show the expression of bloom genes in this 10X data.
#bloom.filepath<-"~/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/scRNA_compare/analysis4b_BLOOM.tsv"
my.distinct.colors<-c('#e6194b' , '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#808080', '#800000', '#808000', '#ffd8b1', '#000075', '#ffffff', '#000000','#fffac8','#aaffc3')
celltype.colors<-my.distinct.colors[1:length(unique(my.data@meta.data$celltype))]
bloom.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/Data/analysis4b_BLOOM.tsv"
bloom.data<-as.data.frame(read_tsv(bloom.filepath))
bloom.genes<-bloom.data[bloom.data$`CORE ENRICHMENT`=="Yes","SYMBOL"]
temp.names<-intersect(bloom.genes,rownames(sc.data@assays$integrated@counts))
#output.filepath<-"~/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/scRNA_compare/featureplot_bloom_4b.pdf"
output.filepath<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare/featureplot_bloom_4b.pdf"
pdf(output.filepath,width = 8,height=8,onefile = T)
#g.list<-list()
for(i in 1:length(temp.names)){
temp.data<-my.data@meta.data
g.1<-ggplot(temp.data,aes(x=UMAP_1,y=UMAP_2,colour=celltype))+geom_point(size=0.1,shape=19)+scale_color_manual(values=celltype.colors)+guides(color = guide_legend(override.aes = list(size = 8)))
temp.color<-sc.data@assays$integrated@counts[temp.names[i],]
temp.data<-cbind(sc.data@meta.data,temp.color)
colnames(temp.data)[ncol(temp.data)]<-"col"
g.2<-ggplot(temp.data,aes(x=UMAP_1,y=UMAP_2,colour=col))+geom_point(size=0.01,shape=19)+scale_color_gradient(low="grey",high="red")+ggtitle(temp.names[i])
#output.filepath<-file.path("~/driver_Grace/scratch/GRADS/SARC_results/Results_summary_PBMC_hg38/baseline/scRNA_compare",paste("featureplot_bloom_4b_",temp.names[i],".pdf"))
g<-grid.arrange(g.1,g.2,ncol=2)
g
output.filepath<-file.path("/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA_compare",paste("featureplot_bloom_4b_",temp.names[i],".pdf"))
ggsave(output.filepath,g,device="pdf",width=8,height=8)
}
dev.off()First, we evaluate the enrichment of BLOOM genes in the marker gene lists